import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
import copy
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
# Load the pre-split wine data: one CSV per partition.
df_train = pd.read_csv('wine_train.csv')
df_test = pd.read_csv('wine_test.csv')

# Peek at the first rows of the training set.
df_train.head()
| Type | Alcohol | Malic | Ash | Alcalinity | Magnesium | Phenols | Flavanoids | Nonflavanoids | Proanthocyanins | Color | Hue | Dilution | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 13.77 | 1.90 | 2.68 | 17.1 | 115 | 3.00 | 2.79 | 0.39 | 1.68 | 6.30 | 1.13 | 2.93 | 1375 |
| 1 | 1 | 13.94 | 1.73 | 2.27 | 17.4 | 108 | 2.88 | 3.54 | 0.32 | 2.08 | 8.90 | 1.12 | 3.10 | 1260 |
| 2 | 1 | 13.75 | 1.73 | 2.41 | 16.0 | 89 | 2.60 | 2.76 | 0.29 | 1.81 | 5.60 | 1.15 | 2.90 | 1320 |
| 3 | 1 | 12.85 | 1.60 | 2.52 | 17.8 | 95 | 2.48 | 2.37 | 0.26 | 1.46 | 3.93 | 1.09 | 3.63 | 1015 |
| 4 | 1 | 13.63 | 1.81 | 2.70 | 17.2 | 112 | 2.85 | 2.91 | 0.30 | 1.46 | 7.30 | 1.28 | 2.88 | 1310 |
# Count missing values per column — the output below shows zero everywhere,
# so no imputation is needed.
df_train.isnull().sum()
Type 0 Alcohol 0 Malic 0 Ash 0 Alcalinity 0 Magnesium 0 Phenols 0 Flavanoids 0 Nonflavanoids 0 Proanthocyanins 0 Color 0 Hue 0 Dilution 0 Proline 0 dtype: int64
# Summary statistics (count/mean/std/quartiles) for all numeric columns.
df_train.describe()
| Type | Alcohol | Malic | Ash | Alcalinity | Magnesium | Phenols | Flavanoids | Nonflavanoids | Proanthocyanins | Color | Hue | Dilution | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 | 123.000000 |
| mean | 1.934959 | 13.045285 | 2.387154 | 2.377398 | 19.604065 | 99.105691 | 2.293496 | 2.040163 | 0.362033 | 1.572439 | 5.122276 | 0.949561 | 2.614146 | 745.341463 |
| std | 0.776075 | 0.817379 | 1.111320 | 0.283956 | 3.605492 | 12.958201 | 0.629254 | 1.019045 | 0.123308 | 0.556818 | 2.329248 | 0.224467 | 0.732045 | 328.719693 |
| min | 1.000000 | 11.450000 | 0.890000 | 1.360000 | 10.600000 | 78.000000 | 0.980000 | 0.340000 | 0.140000 | 0.410000 | 1.280000 | 0.540000 | 1.270000 | 278.000000 |
| 25% | 1.000000 | 12.370000 | 1.655000 | 2.225000 | 17.050000 | 88.000000 | 1.770000 | 1.095000 | 0.270000 | 1.235000 | 3.260000 | 0.775000 | 1.890000 | 495.000000 |
| 50% | 2.000000 | 13.050000 | 1.900000 | 2.380000 | 19.500000 | 97.000000 | 2.400000 | 2.110000 | 0.340000 | 1.550000 | 4.900000 | 0.960000 | 2.780000 | 650.000000 |
| 75% | 3.000000 | 13.725000 | 3.170000 | 2.600000 | 21.550000 | 106.500000 | 2.800000 | 2.895000 | 0.430000 | 1.955000 | 6.250000 | 1.120000 | 3.205000 | 1002.500000 |
| max | 3.000000 | 14.830000 | 5.800000 | 3.230000 | 30.000000 | 139.000000 | 3.880000 | 5.080000 | 0.660000 | 2.960000 | 13.000000 | 1.420000 | 4.000000 | 1547.000000 |
The 'Type' column is still stored as an integer, but it is really a categorical variable, so we convert it to the object (string) type.
# 'Type' is a class label, not a quantity: store it as a string so that
# pandas and sklearn treat it as categorical rather than numeric.
df_train['Type'] = df_train['Type'].astype(str)
df_test['Type'] = df_test['Type'].astype(str)

# Confirm the conversion: describe() now reports 'Type' as an object column.
df_train.describe(include='object')
| Type | |
|---|---|
| count | 123 |
| unique | 3 |
| top | 2 |
| freq | 49 |
# Class distribution in the training split.
df_train.Type.value_counts()
2 49 1 41 3 33 Name: Type, dtype: int64
# Class distribution in the test split, to compare with the training split.
df_test.Type.value_counts()
2 22 1 18 3 15 Name: Type, dtype: int64
The class proportions in the test and training data are similar, so we can proceed with further analysis.
# One box plot per feature (each with its own y-scale) to eyeball
# value ranges and spot outliers.
df_train.plot(
    kind='box',
    subplots=True,
    sharey=False,
    figsize=(20, 5),
);
All the variables look sensible and there are few outliers. Thus, we finish data preprocessing and move on to EDA and modelling.
# Pairwise scatter plots coloured by class, with KDE curves on the diagonal,
# to visually assess which features separate the three wine types.
sns.pairplot(df_train, diag_kind="kde", hue='Type', height=2);
From the pairplot, the two-dimensional scatter plots of 'Alcohol', 'Flavanoids', and 'Proline' against the other columns clearly separate the three classes; the univariate distributions of these three variables also separate the classes well.
# Per-class box plots: one panel per feature, grouped by wine Type,
# each panel on its own y-scale.
df_train.plot(kind='box', by='Type', subplots=True, sharey=False, figsize=(20, 5));
From the boxplots, we conclude that the columns 'Alcohol', 'Phenols', and 'Flavanoids' can separate the three Types, since for these columns the interquartile ranges of the three Types barely overlap with one another.
# Separate the predictors (X) from the class label (y) in each split.
X_train = df_train.drop(columns='Type')
y_train = df_train['Type']
X_test = df_test.drop(columns='Type')
y_test = df_test['Type']

# Fit a Linear Discriminant Analysis classifier on the training data.
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
LinearDiscriminantAnalysis()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearDiscriminantAnalysis()
# LDA training error = 1 - training accuracy (output below: 0.0).
train_accuracy = lda_model.score(X_train, y_train)
1 - train_accuracy
0.0
# LDA held-out test error; stored for the final comparison bar chart.
test_accuracy = lda_model.score(X_test, y_test)
LDA_testerror=1 - test_accuracy
LDA_testerror
0.018181818181818188
# Project the training data onto the two linear discriminant axes and
# scatter-plot it, one colour per wine Type, to visualise class separation.
wine_train_ld = lda_model.transform(X_train)
plt.figure()
for colour, class_label in zip(['red', 'green', 'blue'], lda_model.classes_):
    in_class = y_train == class_label
    plt.scatter(wine_train_ld[in_class, 0], wine_train_ld[in_class, 1],
                alpha=.8, color=colour, label=class_label)
plt.legend(loc='best')
plt.xlabel("LD1")
plt.ylabel("LD2")
plt.show()
# Fit Quadratic Discriminant Analysis (per-class covariance matrices,
# unlike LDA's shared covariance).
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)
QuadraticDiscriminantAnalysis()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
QuadraticDiscriminantAnalysis()
# QDA training error = 1 - training accuracy (output below: 0.0).
train_accuracy = qda_model.score(X_train, y_train)
1 - train_accuracy
0.0
# QDA held-out test error; stored for the final comparison bar chart.
test_accuracy = qda_model.score(X_test, y_test)
QDA_testerror=1 - test_accuracy
QDA_testerror
0.036363636363636376
# Fit Gaussian Naive Bayes on all 13 predictors.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GaussianNB()
# NB training error = 1 - training accuracy.
train_accuracy = nb_model.score(X_train, y_train)
1 - train_accuracy
0.016260162601625994
# NB held-out test error; stored for the final comparison bar chart.
test_accuracy = nb_model.score(X_test, y_test)
NB_testerror=1 - test_accuracy
NB_testerror
0.036363636363636376
# Pairwise correlations between predictors, to check the Naive Bayes
# conditional-independence assumption.
X_train.corr()
| Alcohol | Malic | Ash | Alcalinity | Magnesium | Phenols | Flavanoids | Nonflavanoids | Proanthocyanins | Color | Hue | Dilution | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alcohol | 1.000000 | -0.008885 | 0.193497 | -0.288033 | 0.295328 | 0.385383 | 0.299960 | -0.200444 | 0.276881 | 0.521001 | 0.031352 | 0.116683 | 0.656093 |
| Malic | -0.008885 | 1.000000 | 0.197431 | 0.342293 | -0.056300 | -0.347225 | -0.421671 | 0.307300 | -0.256818 | 0.194271 | -0.575148 | -0.388483 | -0.242436 |
| Ash | 0.193497 | 0.197431 | 1.000000 | 0.479684 | 0.423616 | 0.144842 | 0.114447 | 0.185207 | 0.059243 | 0.320230 | -0.080704 | -0.012247 | 0.252081 |
| Alcalinity | -0.288033 | 0.342293 | 0.479684 | 1.000000 | -0.000781 | -0.326835 | -0.343670 | 0.351220 | -0.225598 | 0.064881 | -0.323579 | -0.276207 | -0.419323 |
| Magnesium | 0.295328 | -0.056300 | 0.423616 | -0.000781 | 1.000000 | 0.259678 | 0.251872 | -0.225951 | 0.133979 | 0.227461 | 0.086653 | 0.094856 | 0.444833 |
| Phenols | 0.385383 | -0.347225 | 0.144842 | -0.326835 | 0.259678 | 1.000000 | 0.868022 | -0.421982 | 0.653261 | 0.012345 | 0.453964 | 0.703416 | 0.519609 |
| Flavanoids | 0.299960 | -0.421671 | 0.114447 | -0.343670 | 0.251872 | 0.868022 | 1.000000 | -0.500535 | 0.689941 | -0.138198 | 0.565990 | 0.790985 | 0.496677 |
| Nonflavanoids | -0.200444 | 0.307300 | 0.185207 | 0.351220 | -0.225951 | -0.421982 | -0.500535 | 1.000000 | -0.332991 | 0.124318 | -0.279832 | -0.483187 | -0.308228 |
| Proanthocyanins | 0.276881 | -0.256818 | 0.059243 | -0.225598 | 0.133979 | 0.653261 | 0.689941 | -0.332991 | 1.000000 | 0.037539 | 0.312483 | 0.528544 | 0.366672 |
| Color | 0.521001 | 0.194271 | 0.320230 | 0.064881 | 0.227461 | 0.012345 | -0.138198 | 0.124318 | 0.037539 | 1.000000 | -0.464077 | -0.408768 | 0.316594 |
| Hue | 0.031352 | -0.575148 | -0.080704 | -0.323579 | 0.086653 | 0.453964 | 0.565990 | -0.279832 | 0.312483 | -0.464077 | 1.000000 | 0.572618 | 0.308091 |
| Dilution | 0.116683 | -0.388483 | -0.012247 | -0.276207 | 0.094856 | 0.703416 | 0.790985 | -0.483187 | 0.528544 | -0.408768 | 0.572618 | 1.000000 | 0.317553 |
| Proline | 0.656093 | -0.242436 | 0.252081 | -0.419323 | 0.444833 | 0.519609 | 0.496677 | -0.308228 | 0.366672 | 0.316594 | 0.308091 | 0.317553 | 1.000000 |
We calculate the correlation matrix of the predictors and notice that pairs such as ('Proline', 'Alcohol'), ('Phenols', 'Dilution'), and ('Phenols', 'Flavanoids') have relatively high correlations. Thus the conditional-independence assumption of Naive Bayes may not hold, which could explain its relatively poor performance. We therefore drop highly correlated variables and refit the NB model.
# Refit Gaussian NB after dropping 'Phenols' and 'Dilution', two of the
# features most strongly correlated with others in the correlation matrix.
nb_model_new = GaussianNB()
nb_model_new.fit(X_train.drop(['Phenols','Dilution'],axis=1), y_train)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GaussianNB()
# Training error of the reduced-feature NB model.
# BUG FIX: the original displayed `1 - train_accuracy`, which still held the
# training accuracy of the OLD full-feature NB model; `train_accuracy2` was
# computed but never used.
train_accuracy2 = nb_model_new.score(X_train.drop(['Phenols','Dilution'],axis=1), y_train)
1 - train_accuracy2
0.016260162601625994
# Test error of the reduced-feature NB model.
# BUG FIX: the original called `nb_model.score(X_test, y_test)` — the OLD
# model on the FULL feature set — so `NB_testerror_new` merely duplicated
# `NB_testerror`. Score the new model on the same reduced feature set it
# was trained on.
test_accuracy_new = nb_model_new.score(X_test.drop(['Phenols','Dilution'],axis=1), y_test)
NB_testerror_new=1 - test_accuracy_new
NB_testerror_new
0.036363636363636376
The performance of the new NB model is still not much better.
# Bar chart comparing held-out test error across the four fitted classifiers.
model_names = ['LDA', 'QDA', 'NB', 'NB_new']
model_errors = [LDA_testerror, QDA_testerror, NB_testerror, NB_testerror_new]

fig = plt.figure(figsize=(10, 5))
plt.bar(model_names, model_errors, width=0.4, color='maroon')
plt.xlabel("Classifier")
plt.ylabel("test error")
plt.title("Different test error between classifier")
plt.show()
From the bar plot, we can see that the LDA method gives the best performance, with the lowest test error.